import pandas as pd
import datetime
import plotly.express as px
from plotly import graph_objects as go
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats as st
# Load the four worksheets of the KPMG raw-data workbook
# (sheet 0 is the title page, so data starts at sheet_name=1).
_WORKBOOK = 'KPMG_VI_New_rawdata_update_final.xlsx'
transactions = pd.read_excel(_WORKBOOK, sheet_name=1)
new_customers = pd.read_excel(_WORKBOOK, sheet_name=2)
customer_demographic = pd.read_excel(_WORKBOOK, sheet_name=3)
customer_address = pd.read_excel(_WORKBOOK, sheet_name=4)
C:\Users\sam19\AppData\Local\Temp\ipykernel_11228\1701482122.py:2: FutureWarning: Inferring datetime64[ns] from data containing strings is deprecated and will be removed in a future version. To retain the old behavior explicitly pass Series(data, dtype=datetime64[ns])
new_customers=pd.read_excel('KPMG_VI_New_rawdata_update_final.xlsx', sheet_name=2)
C:\Users\sam19\AppData\Local\Temp\ipykernel_11228\1701482122.py:3: FutureWarning: Inferring datetime64[ns] from data containing strings is deprecated and will be removed in a future version. To retain the old behavior explicitly pass Series(data, dtype=datetime64[ns])
customer_demographic=pd.read_excel('KPMG_VI_New_rawdata_update_final.xlsx', sheet_name=3)
# Inspect dtypes and null counts of the transactions sheet.
transactions.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 20000 entries, 0 to 19999 Data columns (total 13 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 transaction_id 20000 non-null float64 1 product_id 20000 non-null float64 2 customer_id 20000 non-null float64 3 transaction_date 20000 non-null datetime64[ns] 4 online_order 19640 non-null float64 5 order_status 20000 non-null object 6 brand 19803 non-null object 7 product_line 19803 non-null object 8 product_class 19803 non-null object 9 product_size 19803 non-null object 10 list_price 20000 non-null float64 11 standard_cost 19803 non-null float64 12 product_first_sold_date 19803 non-null float64 dtypes: datetime64[ns](1), float64(7), object(5) memory usage: 2.0+ MB
# List the distinct brand values (includes NaN for 197 rows).
transactions['brand'].unique()
array(['Solex', 'Trek Bicycles', 'OHM Cycles', 'Norco Bicycles',
'Giant Bicycles', 'WeareA2B', nan], dtype=object)
# Cast every id-like column to integer — they were read in as floats
# and all three id columns are fully non-null, so the cast is safe.
id_columns = [name for name in transactions.columns if 'id' in name]
for name in id_columns:
    transactions[name] = transactions[name].astype('int64')
# product_first_sold_date is stored as an Excel serial day count;
# convert it to a real timestamp (Excel's epoch is 1899-12-30).
transactions['product_first_sold_date'] = (
    pd.to_datetime('1899-12-30')
    + pd.to_timedelta(transactions['product_first_sold_date'], unit='D')
)
# Check unique values in the categorical product columns.
print(transactions['product_line'].unique())
print(transactions['product_class'].unique())
print(transactions['product_size'].unique())
['Standard' 'Road' 'Mountain' 'Touring' nan] ['medium' 'low' 'high' nan] ['medium' 'large' 'small' nan]
# Summary statistics of the numeric columns to spot abnormal values.
transactions.describe()
| transaction_id | product_id | customer_id | online_order | list_price | standard_cost | |
|---|---|---|---|---|---|---|
| count | 20000.000000 | 20000.00000 | 20000.000000 | 19640.000000 | 20000.000000 | 19803.000000 |
| mean | 10000.500000 | 45.36465 | 1738.246050 | 0.500458 | 1107.829449 | 556.046951 |
| std | 5773.647028 | 30.75359 | 1011.951046 | 0.500013 | 582.825242 | 405.955660 |
| min | 1.000000 | 0.00000 | 1.000000 | 0.000000 | 12.010000 | 7.210000 |
| 25% | 5000.750000 | 18.00000 | 857.750000 | 0.000000 | 575.270000 | 215.140000 |
| 50% | 10000.500000 | 44.00000 | 1736.000000 | 1.000000 | 1163.890000 | 507.580000 |
| 75% | 15000.250000 | 72.00000 | 2613.000000 | 1.000000 | 1635.300000 | 795.100000 |
| max | 20000.000000 | 100.00000 | 5034.000000 | 1.000000 | 2091.470000 | 1759.850000 |
Prices vary from 12 to 2091 dollars, values seem to be realistic, no abnormality
# Add a margin column (list price minus standard cost) to find the
# most profitable options; NaN where standard_cost is missing.
transactions['margin']=transactions['list_price']-transactions['standard_cost']
# Inspect dtypes and null counts of the demographic sheet.
customer_demographic.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 4000 entries, 0 to 3999 Data columns (total 13 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 customer_id 4000 non-null float64 1 first_name 4000 non-null object 2 last_name 3875 non-null object 3 gender 4000 non-null object 4 past_3_years_bike_related_purchases 4000 non-null float64 5 DOB 3913 non-null datetime64[ns] 6 job_title 3494 non-null object 7 job_industry_category 3344 non-null object 8 wealth_segment 4000 non-null object 9 deceased_indicator 4000 non-null object 10 default 3698 non-null object 11 owns_car 4000 non-null object 12 tenure 3913 non-null float64 dtypes: datetime64[ns](1), float64(3), object(9) memory usage: 406.4+ KB
# Convert float columns that are fully populated to int64.
# The original used astype('int64', errors='ignore'), which is
# deprecated (removed in modern pandas) and silently left columns
# containing NaN (e.g. tenure) unchanged.  Make that intent explicit:
# cast only float columns without missing values — same result,
# no deprecated API.
for column in customer_demographic.columns:
    col = customer_demographic[column]
    if col.dtype == 'float64' and not col.isna().any():
        customer_demographic[column] = col.astype('int64')
# Normalise the gender labels to the canonical 'Female'/'Male' forms.
customer_demographic['gender'] = customer_demographic['gender'].replace(
    {'F': 'Female', 'M': 'Male', 'Femal': 'Female'})
# Snapshot "now" once so every age is computed against the same instant.
customer_demographic['today'] = datetime.datetime.today()
# Approximate age in years at the snapshot date (day count / 365;
# leap years are ignored, which is fine for bracketing purposes).
customer_demographic['age'] = (
    (customer_demographic['today'] - customer_demographic['DOB'])
    .apply(lambda delta: delta.days / 365))
# Round ages to whole years.  The original code rounded
# `customer_demographic_new['age']` inside a try/except with a bare
# `except: pass`, but `customer_demographic_new` is only created later
# in the notebook — the NameError was silently swallowed and the ages
# were never rounded (the describe() output below still shows
# fractional ages).  Round the intended frame directly; Series
# rounding handles NaN (missing DOB) fine, so no guard is needed.
customer_demographic['age']=round(customer_demographic['age'])
# Summary statistics — used below to spot implausible ages.
customer_demographic.describe()
| customer_id | past_3_years_bike_related_purchases | tenure | age | |
|---|---|---|---|---|
| count | 4000.000000 | 4000.000000 | 3913.000000 | 3913.000000 |
| mean | 2000.500000 | 48.890000 | 10.657041 | 46.131513 |
| std | 1154.844867 | 28.715005 | 5.660146 | 12.804141 |
| min | 1.000000 | 0.000000 | 1.000000 | 21.454795 |
| 25% | 1000.750000 | 24.000000 | 6.000000 | 36.495890 |
| 50% | 2000.500000 | 48.000000 | 11.000000 | 46.106849 |
| 75% | 3000.250000 | 73.000000 | 15.000000 | 55.610959 |
| max | 4000.000000 | 99.000000 | 22.000000 | 179.780822 |
There is a user aged 179 years, which is obviously erroneous data; let's look at the age distribution of users.
# Histogram of customer ages to visualise the outliers (max ~180).
fig = plt.figure()
customer_demographic['age'].hist(bins=50)
plt.title('Age distribution')
plt.show()
The main part of users are younger than 70 years, there are few abnormal values more than 80 years
# Exclude users older than 90 (mistaken records).  NOTE: query also
# drops rows whose age is NaN, since NaN comparisons are False.
customer_demographic=customer_demographic.query('age<90')
# Inspect dtypes and null counts of the new-customers sheet.
new_customers.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 1000 entries, 0 to 999 Data columns (total 23 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 first_name 1000 non-null object 1 last_name 971 non-null object 2 gender 1000 non-null object 3 past_3_years_bike_related_purchases 1000 non-null int64 4 DOB 983 non-null datetime64[ns] 5 job_title 894 non-null object 6 job_industry_category 835 non-null object 7 wealth_segment 1000 non-null object 8 deceased_indicator 1000 non-null object 9 owns_car 1000 non-null object 10 tenure 1000 non-null float64 11 address 1000 non-null object 12 postcode 1000 non-null float64 13 state 1000 non-null object 14 country 1000 non-null object 15 property_valuation 1000 non-null float64 16 Unnamed: 16 1000 non-null float64 17 Unnamed: 17 1000 non-null float64 18 Unnamed: 18 1000 non-null float64 19 Unnamed: 19 1000 non-null float64 20 Unnamed: 20 1000 non-null int64 21 Rank 1000 non-null float64 22 Value 1000 non-null float64 dtypes: datetime64[ns](1), float64(9), int64(2), object(11) memory usage: 179.8+ KB
# Snapshot "now" so every age is computed against the same instant.
new_customers['today']=datetime.datetime.today()
# Approximate age in years at the snapshot date (day count / 365).
new_customers['age']=(new_customers['today']-new_customers['DOB']).apply(lambda x: x.days/365)
# Round to whole years.  The original wrapped this in a bare
# `try/except: pass`, which would silently hide any real failure;
# rounding a Series handles NaN ages (missing DOB) fine, so no
# guard is needed.
new_customers['age']=round(new_customers['age'])
# Create a function for dividing ages into age groups.
def age_group(age):
    """Return the age-bracket label for a numeric age.

    Brackets: '<=21', '21-35', '36-50', '50-75', '>75', plus
    'unknown' for missing ages.

    Bug fixed: in the original if-chain the final ``else`` paired only
    with the last ``if``, so ages of 21 or less — and NaN, for which
    every comparison is False — silently fell into '>75'.  They now
    get explicit labels.
    """
    if pd.isna(age):
        return 'unknown'
    if age <= 21:
        return '<=21'
    if age <= 35:
        return '21-35'
    if age <= 50:
        return '36-50'
    if age <= 75:
        return '50-75'
    return '>75'
# Label each new customer with an age bracket.
new_customers['age_group']=new_customers['age'].apply(age_group)
# Inspect dtypes and null counts of the address sheet.
customer_address.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 3999 entries, 0 to 3998 Data columns (total 6 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 customer_id 3999 non-null float64 1 address 3999 non-null object 2 postcode 3999 non-null float64 3 state 3999 non-null object 4 country 3999 non-null object 5 property_valuation 3999 non-null float64 dtypes: float64(3), object(3) memory usage: 187.6+ KB
# Check unique state values — abbreviations and full names are mixed.
customer_address['state'].unique()
array(['New South Wales', 'QLD', 'VIC', 'NSW', 'Victoria'], dtype=object)
# Collapse the state abbreviations into their full names so every
# state has a single canonical spelling.
state_names = {
    'QLD': 'Queensland',
    'VIC': 'Victoria',
    'NSW': 'New South Wales',
}
customer_address['state'] = customer_address['state'].replace(state_names)
# Join the demographic table with the customer address table.
# Left join keeps every demographic row; the 4 customers without an
# address get NaN address columns (see info() output below).
customer_demographic_new=customer_demographic.merge(customer_address, on='customer_id', how='left')
# Label each customer with an age bracket.
customer_demographic_new['age_group']=customer_demographic_new['age'].apply(age_group)
# Inspect the joined frame's dtypes and null counts.
customer_demographic_new.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 3911 entries, 0 to 3910 Data columns (total 21 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 customer_id 3911 non-null int64 1 first_name 3911 non-null object 2 last_name 3790 non-null object 3 gender 3911 non-null object 4 past_3_years_bike_related_purchases 3911 non-null int64 5 DOB 3911 non-null datetime64[ns] 6 job_title 3414 non-null object 7 job_industry_category 3255 non-null object 8 wealth_segment 3911 non-null object 9 deceased_indicator 3911 non-null object 10 default 3697 non-null object 11 owns_car 3911 non-null object 12 tenure 3911 non-null float64 13 today 3911 non-null datetime64[ns] 14 age 3911 non-null float64 15 address 3907 non-null object 16 postcode 3907 non-null float64 17 state 3907 non-null object 18 country 3907 non-null object 19 property_valuation 3907 non-null float64 20 age_group 3911 non-null object dtypes: datetime64[ns](2), float64(4), int64(2), object(13) memory usage: 672.2+ KB
# Check the distribution of customers across age groups.
customer_demographic_new['age_group'].value_counts()
36-50 1646 50-75 1398 21-35 863 >75 4 Name: age_group, dtype: int64
The data is distributed consistently, the main part of users is between 36-75 years old
# Join transaction rows to customer info; left join keeps all 20000
# transactions, customers missing from the demographic table yield NaN.
transactions_new=transactions.merge(customer_demographic_new, on='customer_id', how='left')
# Transactions per age group.
transactions_new['age_group'].value_counts()
36-50 8282 50-75 6879 21-35 4351 >75 20 Name: age_group, dtype: int64
# Inspect the merged transactions frame.
transactions_new.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 20000 entries, 0 to 19999 Data columns (total 34 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 transaction_id 20000 non-null int64 1 product_id 20000 non-null int64 2 customer_id 20000 non-null int64 3 transaction_date 20000 non-null datetime64[ns] 4 online_order 19640 non-null float64 5 order_status 20000 non-null object 6 brand 19803 non-null object 7 product_line 19803 non-null object 8 product_class 19803 non-null object 9 product_size 19803 non-null object 10 list_price 20000 non-null float64 11 standard_cost 19803 non-null float64 12 product_first_sold_date 19803 non-null datetime64[ns] 13 margin 19803 non-null float64 14 first_name 19532 non-null object 15 last_name 18906 non-null object 16 gender 19532 non-null object 17 past_3_years_bike_related_purchases 19532 non-null float64 18 DOB 19532 non-null datetime64[ns] 19 job_title 17177 non-null object 20 job_industry_category 16303 non-null object 21 wealth_segment 19532 non-null object 22 deceased_indicator 19532 non-null object 23 default 18536 non-null object 24 owns_car 19532 non-null object 25 tenure 19532 non-null float64 26 today 19532 non-null datetime64[ns] 27 age 19532 non-null float64 28 address 19503 non-null object 29 postcode 19503 non-null float64 30 state 19503 non-null object 31 country 19503 non-null object 32 property_valuation 19503 non-null float64 33 age_group 19532 non-null object dtypes: datetime64[ns](4), float64(9), int64(3), object(18) memory usage: 5.3+ MB
Transactions among age groups are distributed according to the amount of users in each group
# Create a helper for grouping the merged transactions table.
def grouped_data(param, value, func1, func2=None):
    """Group ``transactions_new`` by ``param`` and aggregate ``value``.

    Parameters
    ----------
    param : column label, or list of labels, to group by.
    func1 : first aggregation (name such as 'count' or a callable).
    func2 : optional second aggregation.  Bug fixed: the original
        always passed ``[func1, func2]`` to ``.agg()``, so leaving
        ``func2`` at its ``None`` default raised inside pandas; a
        single aggregation now works.

    Returns
    -------
    DataFrame with one column per aggregation.
    """
    funcs = [func1] if func2 is None else [func1, func2]
    return transactions_new.groupby(param)[value].agg(funcs)
# Transaction count and median list price by age group and gender.
grouped_gender=grouped_data(['age_group', 'gender'],'list_price', 'count', 'median').reset_index()
grouped_gender
| age_group | gender | count | median | |
|---|---|---|---|---|
| 0 | 21-35 | Female | 2185 | 1163.89 |
| 1 | 21-35 | Male | 2166 | 1151.96 |
| 2 | 36-50 | Female | 4229 | 1148.64 |
| 3 | 36-50 | Male | 4053 | 1172.78 |
| 4 | 50-75 | Female | 3589 | 1163.89 |
| 5 | 50-75 | Male | 3290 | 1163.89 |
| 6 | >75 | Female | 8 | 979.34 |
| 7 | >75 | Male | 12 | 1209.49 |
# Stacked bar: transactions per age group split by gender,
# annotated with the median list price.
fig= px.bar(grouped_gender, x='count', y='age_group', color='gender', text='median')
fig.update_layout(title='Distribution of age group',
xaxis_title='Number of transactions',
yaxis_title='',
barmode='stack',
yaxis={'categoryorder':'total ascending'}, title_x=0.5)
fig.show()
On average, expenses in each age group are the same. The numbers of men and women in most groups are similar; users older than 75 are mostly men, and there are very few of them.
# Check how total and median list price depend on job industry category.
grouped_industry=grouped_data('job_industry_category','list_price', 'sum', 'median')
grouped_industry.sort_values(by='sum', ascending=False)
| sum | median | |
|---|---|---|
| job_industry_category | ||
| Manufacturing | 4422166.41 | 1163.89 |
| Financial Services | 4288265.20 | 1163.89 |
| Health | 3387936.62 | 1151.96 |
| Retail | 1968148.98 | 1198.46 |
| Property | 1436242.47 | 1151.96 |
| Entertainment | 784287.64 | 1172.78 |
| IT | 774993.82 | 1172.78 |
| Argiculture | 617353.97 | 1071.23 |
| Telecommunications | 375236.00 | 1148.64 |
# Bar chart of total list price per industry, annotated with the median.
# NOTE(review): the title says 'median price' but bar length is the
# *sum* of list_price — consider retitling.
fig= px.bar(grouped_industry, x='sum', y=grouped_industry.index, color=grouped_industry.index, text='median')
fig.update_layout(title='Distribution of median price',
xaxis_title='Number of transactions',
yaxis_title='',
barmode='stack',
yaxis={'categoryorder':'total ascending'}, title_x=0.5)
fig.show()
On average, median list price for all industry categories is almost the same, users with manufacturing and financial occupations spent more than other users
# Check how wealth segments are distributed depending on occupation.
grouped_wealth=grouped_data(['job_industry_category','wealth_segment'], 'list_price', 'count', 'median').reset_index()
# Stacked bar: transactions per industry split by wealth segment,
# annotated with the median list price.
fig= px.bar(grouped_wealth, x='count', y='job_industry_category', color='wealth_segment', text='median')
fig.update_layout(title='Distribution of wealth_segment',
xaxis_title='Number of transactions',
yaxis_title='',
barmode='stack',
yaxis={'categoryorder':'total ascending'}, title_x=0.5)
fig.show()
# 100%-stacked view: share of each wealth segment inside every industry.
fig = px.histogram(grouped_wealth,
                   x='count',
                   y='job_industry_category',
                   barnorm="percent",
                   color='wealth_segment',
                   color_discrete_sequence=px.colors.qualitative.Set3
                   )
fig.update_layout(title='Wealth segment distribution',
                  # Bug fixed: the x-axis label was a leftover Russian
                  # string ('Доля заведений' = "share of venues") from an
                  # unrelated project; label what the axis actually shows.
                  xaxis_title='Share of transactions, %',
                  yaxis_title='', title_x=0.5,
                  )
fig.show()
Median list price is almost the same for all wealth segments. The main part of users are mass customers; the highest share of transactions made by affluent customers is in the manufacturing and financial categories.
# Transaction count and median list price per state.
grouped_state=grouped_data(['state'], 'list_price', 'count', 'median')
grouped_state
| count | median | |
|---|---|---|
| state | ||
| New South Wales | 10435 | 1151.96 |
| Queensland | 4173 | 1163.89 |
| Victoria | 4895 | 1172.78 |
The biggest part of transactions were made by users from New South Wales. Median price is also almost the same.
# Transaction count and median list price per brand and product class.
grouped_brand=grouped_data(['brand', 'product_class'], 'list_price', 'count', 'median').reset_index()
grouped_brand
| brand | product_class | count | median | |
|---|---|---|---|---|
| 0 | Giant Bicycles | high | 785 | 1179.00 |
| 1 | Giant Bicycles | low | 196 | 590.26 |
| 2 | Giant Bicycles | medium | 2331 | 1403.50 |
| 3 | Norco Bicycles | high | 566 | 1148.64 |
| 4 | Norco Bicycles | low | 632 | 688.63 |
| 5 | Norco Bicycles | medium | 1712 | 1036.59 |
| 6 | OHM Cycles | high | 771 | 227.88 |
| 7 | OHM Cycles | low | 609 | 1073.07 |
| 8 | OHM Cycles | medium | 1663 | 912.52 |
| 9 | Solex | high | 566 | 1024.66 |
| 10 | Solex | low | 410 | 945.04 |
| 11 | Solex | medium | 3277 | 1071.23 |
| 12 | Trek Bicycles | high | 325 | 495.72 |
| 13 | Trek Bicycles | low | 786 | 980.37 |
| 14 | Trek Bicycles | medium | 1879 | 1469.44 |
| 15 | WeareA2B | low | 331 | 642.31 |
| 16 | WeareA2B | medium | 2964 | 1362.99 |
# Stacked bar: transactions per brand split by product class,
# annotated with the median list price.
fig= px.bar(grouped_brand, x='count', y='brand', color='product_class', text='median')
fig.update_layout(title='Distribution of brands',
xaxis_title='Number of transactions',
yaxis_title='',
barmode='stack',
yaxis={'categoryorder':'total ascending'}, title_x=0.5)
fig.show()
# Transaction count and median margin per brand.
grouped_data(['brand'], 'margin', 'count', 'median')
| count | median | |
|---|---|---|
| brand | ||
| Giant Bicycles | 3312 | 448.68 |
| Norco Bicycles | 2910 | 182.81 |
| OHM Cycles | 3043 | 199.10 |
| Solex | 4253 | 437.46 |
| Trek Bicycles | 2990 | 195.34 |
| WeareA2B | 3295 | 834.94 |
The most popular brand is Solex, but brand Giant Bicycles and WeareA2B provide the highest median price. Brand WeareA2B has the highest median margin
Based on conducted analysis we can assume that target users are from New South Wales with occupation in finance, manufacturing and health. The brands which should be offered are Giant Bicycles and WeareA2B, age group 35-75 years
According to analysis we conducted, we can see that median price doesn't depend on wealth segment, however, it's obvious that wealth users may provide more incomes for the company.
# Select prospective target customers: NSW residents working in
# manufacturing or financial services (the segments identified above).
new=(new_customers
.query('state=="NSW" and (job_industry_category=="Manufacturing" or job_industry_category=="Financial Services")'))
# Restrict to the 36-75 target age range (result is displayed by the
# notebook, not stored).
new.query('age>=36 and age<=75')
| first_name | last_name | gender | past_3_years_bike_related_purchases | DOB | job_title | job_industry_category | wealth_segment | deceased_indicator | owns_car | ... | Unnamed: 16 | Unnamed: 17 | Unnamed: 18 | Unnamed: 19 | Unnamed: 20 | Rank | Value | today | age | age_group | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 4 | Melinda | Hadlee | Female | 34 | 1965-09-21 | Financial Analyst | Financial Services | Affluent Customer | N | No | ... | 0.93 | 0.9300 | 1.162500 | 1.162500 | 4 | 4.0 | 1.703125 | 2023-08-19 10:13:28.845287 | 58.0 | 50-75 |
| 6 | Rutledge | Hallt | Male | 23 | 1976-10-06 | Compensation Analyst | Financial Services | Mass Customer | N | No | ... | 0.53 | 0.5300 | 0.530000 | 0.450500 | 6 | 6.0 | 1.671875 | 2023-08-19 10:13:28.845287 | 47.0 | 36-50 |
| 8 | Duff | Karlowicz | Male | 50 | 1972-04-28 | Speech Pathologist | Manufacturing | Mass Customer | N | Yes | ... | 0.68 | 0.8500 | 1.062500 | 0.903125 | 8 | 8.0 | 1.656250 | 2023-08-19 10:13:28.845287 | 51.0 | 50-75 |
| 15 | Dukie | Swire | Male | 88 | 1954-03-31 | NaN | Manufacturing | Affluent Customer | N | Yes | ... | 0.93 | 1.1625 | 1.453125 | 1.453125 | 16 | 16.0 | 1.562500 | 2023-08-19 10:13:28.845287 | 69.0 | 50-75 |
| 19 | Karly | Willavize | Female | 2 | 1954-08-12 | Internal Auditor | Manufacturing | High Net Worth | N | No | ... | 1.06 | 1.0600 | 1.325000 | 1.325000 | 19 | 19.0 | 1.531250 | 2023-08-19 10:13:28.845287 | 69.0 | 50-75 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 960 | Sonia | Dunstall | Female | 0 | 1975-07-30 | Accountant III | Financial Services | Mass Customer | N | No | ... | 0.41 | 0.4100 | 0.512500 | 0.435625 | 960 | 960.0 | 0.442000 | 2023-08-19 10:13:28.845287 | 48.0 | 36-50 |
| 967 | Tillie | Bisseker | Female | 74 | 1960-08-10 | Civil Engineer | Manufacturing | High Net Worth | N | No | ... | 0.95 | 0.9500 | 1.187500 | 1.187500 | 967 | 967.0 | 0.435625 | 2023-08-19 10:13:28.845287 | 63.0 | 50-75 |
| 969 | Evered | Gludor | Male | 3 | 1951-04-30 | Electrical Engineer | Manufacturing | High Net Worth | N | Yes | ... | 0.72 | 0.9000 | 0.900000 | 0.900000 | 967 | 967.0 | 0.435625 | 2023-08-19 10:13:28.845287 | 72.0 | 50-75 |
| 993 | Weidar | Etheridge | Male | 38 | 1959-07-13 | Compensation Analyst | Financial Services | Mass Customer | N | Yes | ... | 0.48 | 0.6000 | 0.600000 | 0.510000 | 994 | 994.0 | 0.382500 | 2023-08-19 10:13:28.845287 | 64.0 | 50-75 |
| 995 | Ferdinand | Romanetti | Male | 60 | 1959-10-07 | Paralegal | Financial Services | Affluent Customer | N | No | ... | 0.84 | 0.8400 | 0.840000 | 0.840000 | 996 | 996.0 | 0.374000 | 2023-08-19 10:13:28.845287 | 64.0 | 50-75 |
124 rows × 26 columns
There are 124 new customers which can be considered as target. As brands Giant Bicycles and WeareA2B can provide the highest profit they can be promoted more intensively. At the same time it should be considered that some actions are required for improving sales among wealth customers
# Hypothesis test: do affluent customers spend more than mass customers?
# H0: list-price spending of Affluent and Mass customers is the same.
# H1: Affluent customers spend more.
sample_1=transactions_new.loc[transactions_new['wealth_segment']=='Affluent Customer', 'list_price']
# Bug fixed: the original selected `!= 'Mass Customer'`, so the "mass"
# sample actually contained the Affluent and High Net Worth rows — it
# overlapped sample_1 and never included a single Mass Customer.
# Select the Mass Customer segment itself.
sample_2=transactions_new.loc[transactions_new['wealth_segment']=='Mass Customer', 'list_price']
alpha = 0.05
# Welch's t-test (unequal variances); one-sided alternative matches
# the directional H1 "Affluent customers spend more".
results=st.ttest_ind(sample_1, sample_2, equal_var = False, alternative='greater')
print('p-value:', results.pvalue)
if results.pvalue < alpha:
    print("reject the null hypothesis")
else:
    print("don't reject the null hypothesis")
p-value: 0.9531481272581976 don't reject the null hypothesis
As assumed earlier, there is no significant difference in expenses between affluent and mass customers, so one task for future actions should be developing business with wealthier customers.
During the research the following actions were conducted and the following conclusions were established: